customer_id and customer_unique_id?Customer_unique_id and list paires with
Custumer_idCustomer_unique_id and their
zip_code_prefixThis dataset was generously provided by Olist, the largest department store in Brazilian marketplaces. Olist connects small businesses from all over Brazil to channels without hassle and with a single contract. Those merchants are able to sell their products through the Olist Store and ship them directly to the customers using Olist logistics partners. See more on our website: www.olist.com.
The data set comes from kaggle competition named Brazilian E-Commerce Public Dataset by Olist.
# Set python environment and version in RStudio ;-)
reticulate::use_python("/usr/bin/python3.10", required = TRUE)
reticulate::py_config()## python: /usr/bin/python3.10
## libpython: /usr/lib/python3.10/config-3.10-x86_64-linux-gnu/libpython3.10.so
## pythonhome: //usr://usr
## version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0]
## numpy: /home/kirus/.local/lib/python3.10/site-packages/numpy
## numpy_version: 1.26.0
##
## NOTE: Python version was forced by use_python() function
# Load needed R packages
library(tidyverse)
library(data.table)
library(leaflet)
library(stringr)
#require(maps)
library(ggplot2)
#require(mapview)
#require(scales)
#require(RColorBrewer)
require(lubridate)
#library(plotly)
#require(gganimate)
#require(gifski)
#library(caret)
library(DT)
library(kableExtra)customers <- fread("data/olist_customers_dataset.csv")
geolocation <- fread("data/olist_geolocation_dataset.csv")
order_items <- fread("data/olist_order_items_dataset.csv")
order_payments <- fread("data/olist_order_payments_dataset.csv")
order_reviews <- fread("data/olist_order_reviews_dataset.csv")
orders <- fread("data/olist_orders_dataset.csv")
products <- fread("data/olist_products_dataset.csv")
sellers <- fread("data/olist_sellers_dataset.csv")
category_translation <- fread("data/product_category_name_translation.csv")Compute:
\[Volume = product\_length\_cm * product\_height\_cm * product\_width\_cm / 1000\]
Mutate volume range
Mutate weight range
## check for missing value in specific columns
product_id_with_missed_data <-
products |>
select(product_id,product_category_name, product_weight_g,
product_length_cm, product_height_cm, product_width_cm) |>
filter_all(any_vars(is.na(.)))
product_id_with_missed_data |>
knitr::kable() |> kable_styling() |>
scroll_box(width = "900px", height = "300px") | product_id | product_category_name | product_weight_g | product_length_cm | product_height_cm | product_width_cm |
|---|---|---|---|---|---|
| 09ff539a621711667c43eba6a3bd8466 | bebes | NA | NA | NA | NA |
| 5eb564652db742ff8f28759cd8d2652a | NA | NA | NA | NA |
sellers_missing_data <-
order_items |>
filter(product_id %in% product_id_with_missed_data$product_id) %>%
group_by(product_id, seller_id, shipping_limit_date, price, freight_value) %>%
count() %>%
arrange(shipping_limit_date)
sellers_missing_data %>%
datatable()sellers_missing_data %>%
#mutate(total= price+ freight_value) %>%
pivot_longer(-c(product_id, seller_id,n,shipping_limit_date)) %>%
ggplot(aes(x = shipping_limit_date, y= value, fill = name)) +
geom_area()+
facet_wrap(~product_id, scales = "free_y")
* Increase of the price during the period of study.
## add english name and weight range catagories
products <- products |>
left_join(y = category_translation, by = "product_category_name") |>
mutate(`volume dm³` = product_length_cm * product_height_cm * product_width_cm/1000) %>%
rename(weight = product_weight_g) %>%
mutate(`weight kg` = weight/1000)
products |>
head(1000) |>
datatable()products <- products |>
mutate(weight_range = if_else(`weight kg` <= 1,"0-1",
if_else(`weight kg`> 1 & `weight kg` <= 5, "1-5 kg",
if_else(`weight kg` > 5 &`weight kg` <=10, "5-10 kg",
if_else(`weight kg` > 10 & `weight kg` <= 20, "10-20 kg",
if_else(`weight kg` > 20 & `weight kg` <= 30, "20-30 kg",
if_else(`weight kg` > 30 & `weight kg` <= 42, "30-42 kg", NA )))))))
products |>
head(1000) |>
datatable()## glimpse the density of products volumes
products |>
drop_na(`volume dm³`, weight_range) |>
ggplot() +
aes(x=`volume dm³`, color= weight_range, fill= weight_range, na.rm = TRUE)+
geom_density(adjust= 10, alpha=0.1)## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## add volume range
products <- products |>
mutate(volume_range = if_else(`volume dm³` <= 1,"0-1 dm³",
if_else(`volume dm³`> 1 & `volume dm³` <= 5, "1-5 dm³",
if_else(`volume dm³` > 5 &`volume dm³` <=10, "5-10 dm³",
if_else(`volume dm³` > 10 & `volume dm³` <= 20, "10-20 dm³",
if_else(`volume dm³` > 20 & `volume dm³` <= 30, "20-30 dm³",
if_else(`volume dm³` > 30 , ">30 dm3", NA )))))))
products |> head() |>
datatable()## list categories
products %>%
#group_by(product_category_name_english, product_name_lenght) |>
#mutate(Qt=n()) |>
pull(product_category_name_english) |>
unique() |> as.data.frame() |>
rownames_to_column(var = "index") |>
knitr::kable() |> kable_styling() |>
scroll_box(width = "300px", height = "500px")| index | unique(products %>% pull(product_category_name_english)) |
|---|---|
| 1 | perfumery |
| 2 | art |
| 3 | sports_leisure |
| 4 | baby |
| 5 | housewares |
| 6 | musical_instruments |
| 7 | cool_stuff |
| 8 | furniture_decor |
| 9 | home_appliances |
| 10 | toys |
| 11 | bed_bath_table |
| 12 | construction_tools_safety |
| 13 | computers_accessories |
| 14 | health_beauty |
| 15 | luggage_accessories |
| 16 | garden_tools |
| 17 | office_furniture |
| 18 | auto |
| 19 | electronics |
| 20 | fashion_shoes |
| 21 | telephony |
| 22 | stationery |
| 23 | fashion_bags_accessories |
| 24 | computers |
| 25 | home_construction |
| 26 | watches_gifts |
| 27 | construction_tools_construction |
| 28 | pet_shop |
| 29 | small_appliances |
| 30 | agro_industry_and_commerce |
| 31 | NA |
| 32 | furniture_living_room |
| 33 | signaling_and_security |
| 34 | air_conditioning |
| 35 | consoles_games |
| 36 | books_general_interest |
| 37 | costruction_tools_tools |
| 38 | fashion_underwear_beach |
| 39 | fashion_male_clothing |
| 40 | kitchen_dining_laundry_garden_furniture |
| 41 | industry_commerce_and_business |
| 42 | fixed_telephony |
| 43 | construction_tools_lights |
| 44 | books_technical |
| 45 | home_appliances_2 |
| 46 | party_supplies |
| 47 | drinks |
| 48 | market_place |
| 49 | la_cuisine |
| 50 | costruction_tools_garden |
| 51 | fashio_female_clothing |
| 52 | home_confort |
| 53 | audio |
| 54 | food_drink |
| 55 | music |
| 56 | food |
| 57 | tablets_printing_image |
| 58 | books_imported |
| 59 | small_appliances_home_oven_and_coffee |
| 60 | fashion_sport |
| 61 | christmas_supplies |
| 62 | fashion_childrens_clothes |
| 63 | dvds_blu_ray |
| 64 | arts_and_craftmanship |
| 65 | furniture_bedroom |
| 66 | cine_photo |
| 67 | diapers_and_hygiene |
| 68 | flowers |
| 69 | home_comfort_2 |
| 70 | security_and_services |
| 71 | furniture_mattress_and_upholstery |
| 72 | cds_dvds_musicals |
## count how many products by category and count how many products with the same description length
products %>%
#filter(product_category_name== "cama_mesa_banho") |>
#mutate(product_description_lenght= as.factor(product_description_lenght)) |>
group_by(product_category_name_english, product_name_lenght, product_description_lenght) |>
#mutate(Qt= n()) |>
summarise(products_with_same_name_lenght = n(), .groups='keep') |>
group_by(product_description_lenght) |>
#filter(product_description_lenght=="93") |>
mutate(products_with_same_description_lenght = n()) |>
#head(1000) |>
#arrange(desc(Qt)) |>
#filter(product_category_name_english %in% c("baby", "housewares")) |>
arrange(desc(products_with_same_name_lenght)) |>
head(100) |>
datatable()There are 610 products without description
We assume that products from the same category and have different name lenght are different.
We assume that products with the same name lenght and different description lenght are same bu from different sellers.
group_pdt_sellers <-
order_items |>
group_by(product_id, seller_id) |>
summarise(Freq= n(), .groups="keep") |>
arrange(desc(Freq))
group_pdt_sellers |>
head(100) |>
datatable()products |>
left_join(group_pdt_sellers[-3] , by = "product_id") |>
select( product_category_name_english,product_name_lenght, seller_id, product_id) |>
group_by(product_category_name_english, product_name_lenght, seller_id) |>
mutate(products_with_same_name_lenght = n()) |>
arrange(desc(products_with_same_name_lenght)) |>
head(500) |>
datatable()We find which seller has missing product name:
e5a3438891c0bfdb9394643f95273d8e. We can find his
geolocation.
Several items with the same product length name exist for the same seller. It seems the same product but with different color or version or option.
It is possible that products belonging to different categories have the same name length and sold by different sellers.
CONCLUSION: The product_name_lenght is
not specific to a specific product.
products |>
drop_na(`weight kg`, `volume dm³`, `product_category_name_english`) |>
ggplot() +
aes(y = `weight kg` , x = `product_category_name_english`, color= weight_range) +#, size=`volume dm³`
geom_point() +
theme(legend.position = "left", #axis.text.y = element_blank(),
axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
plot.title = element_text(size = 14)
) +
#facet_wrap(~weight_range, nrow = 6, scales = "free_y")+
facet_wrap(~volume_range, nrow = 6, scales= "free_y")
* We can note some products with small volume but with high weigth and
vis-versa.
## join items and orders and products
detail_orders <- order_items |>
left_join(orders, by = "order_id") |>
left_join(products|> select(product_id, product_category_name_english), by = "product_id") |>
select(order_id, order_item_id, product_id, product_category_name_english,seller_id, customer_id, everything())
detail_orders|>
head(1000) |>
datatable()## group by order and check items
orders_customer <- detail_orders |>
group_by( order_id, customer_id, product_category_name_english) |>
summarise(n_items= n(), .groups = "keep") |>
group_by( order_id, customer_id, product_category_name_english, n_items) |>
summarise(n_orders= n(), .groups = "keep") |>
arrange(desc(n_items))
#arrange(desc(n_orders))
orders_customer |>
head(1000) |>
datatable()## check how many customers
n_customers <- detail_orders |>
distinct(customer_id) |>
count() |>
pull()
## check How many orders
n_orders <- detail_orders |>
distinct(order_id) |>
count() |>
pull()
print(paste0("there are ", n_customers, " Customers in this study"))## [1] "there are 98666 Customers in this study"
## [1] "there are 98666 Orders in this study"
## visualize categories that were purchased by customers with more than 6 items per order
orders_customer |>
filter(n_items > 6) |>
ggplot() +
aes(x = reorder(product_category_name_english, desc(n_items), sum), y = n_items, fill= customer_id) +
geom_col() +
theme(legend.position = "bottom", axis.title.x = element_blank(),
axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
plot.title = element_text(size = 14)
)orders_seller <- detail_orders |>
group_by( order_id, seller_id, product_category_name_english) |>
summarise(n_items= n(), .groups = "keep") |>
group_by( order_id,seller_id, product_category_name_english, n_items) |>
summarise(n_orders= n(), .groups = "keep") |>
arrange(desc(n_items))
#arrange(desc(n_orders))
orders_seller |>
head(1000) |>
datatable()orders_seller |>
filter(n_items > 6) |>
ggplot() +
aes(x = reorder(product_category_name_english, desc(n_items), sum), y = n_items, fill= seller_id) +
geom_col() +
theme(legend.position = "bottom", axis.title.x = element_blank(),
axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
plot.title = element_text(size = 14)
)customer_id and
customer_unique_id?## count customer_id
paste0("there is ",
customers |>
distinct(customer_id) |>
nrow(),
" Customer_id in this data")## [1] "there is 99441 Customer_id in this data"
paste0("there is ",
customers |> distinct(customer_unique_id) |> nrow(),
" Customer_unique_id in this data")## [1] "there is 96096 Customer_unique_id in this data"
paste0("There is about ",
customers |> distinct(customer_id) |> nrow() - customers |> distinct(customer_unique_id) |> nrow(),
" rows with duplicated id. But we have to think about one custmer_unique_id have multiple customer_id " )## [1] "There is about 3345 rows with duplicated id. But we have to think about one custmer_unique_id have multiple customer_id "
Customer_unique_id and list paires with
Custumer_id## What is the difference between customer_id and customer_unique_id
list_of_unique_id_with_multiple_customer_id <- customers |>
group_by(customer_unique_id) |>
## filter only duplicated
filter(n()>1) |>
arrange(desc(customer_unique_id))
paste0("Here is a sample of table with ",
list_of_unique_id_with_multiple_customer_id |> nrow(),
" rows. ")## [1] "Here is a sample of table with 6342 rows. "
customer_unique_id corresponding
to multiple customer_id.Customer_unique_id and their
zip_code_prefix## Filter only Zip code for customer_unique_id
customers |>
filter(customer_unique_id %in% duplicated_customer_unique_id) |>
group_by(customer_id, customer_unique_id, zip_code_prefix) |>
#distinct(customer_unique_id) |>
arrange(desc(customer_unique_id)) |>
filter(zip_code_prefix == 99750) |>
datatable()NO!
Only zip_code_prefix is not enought to distinct between geolocations of customers and sellers
For example there is only one customer with the zip_code equal to 99750.
But in geolicalisation we have 14 Positions related to this zip_code
## [1] "There are character(0)Duplited seller_id."
## [2] "There are integer(0)Duplited seller_id."
## [3] "There are character(0)Duplited seller_id."
## [4] "There are character(0)Duplited seller_id."
sellers_geolocation <- geolocation |>
filter(zip_code_prefix %in% sellers$zip_code_prefix) %>% ## IS NOT ENOUTH there are the same zip code for multiple sellers and customers.
left_join(y = sellers, by = "zip_code_prefix")
lng1 <- min(sellers_geolocation$Longitude)
lng2 <- max(sellers_geolocation$Longitude)
lat1 <- min(sellers_geolocation$Latitude)
lat2 <- max(sellers_geolocation$Latitude)
leaflet(sellers_geolocation) %>%
addTiles() %>%
setView(lng = (lng1+lng2)/2, lat = (lat1+lat2)/2, zoom = 3) %>%
addRectangles(
lng1= lng1, lat1= lat1,
lng2= lng2, lat2=lat2,
fillColor = "transparent"
) %>%
addMarkers(~Longitude, ~Latitude,
popup = ~seller_city, label = ~seller_state,
clusterOptions = markerClusterOptions())customers_geolocation <- geolocation |>
#rename(zip_code_prefix= geolocation_zip_code_prefix) |>
filter(zip_code_prefix %in% unique(customers$zip_code_prefix)) |>
left_join(y = customers, by = "zip_code_prefix")
lng1 <- min(customers_geolocation$Longitude)
lng2 <- max(customers_geolocation$Longitude)
lat1 <- min(customers_geolocation$Latitude)
lat2 <- max(customers_geolocation$Latitude)
# leaflet(customers_geolocation) %>%
# addTiles() %>%
# setView(lng = (lng1+lng2)/2, lat = (lat1+lat2)/2, zoom = 3) %>%
# addRectangles(
# lng1= lng1, lat1= lat1,
# lng2= lng2, lat2=lat2,
# fillColor = "transparent"
# ) %>%
# addMarkers(~Longitude, ~Latitude,
# popup = ~customer_city, label = ~customer_state,
# clusterOptions = markerClusterOptions())